library(here)## here() starts at C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining
library(tidyverse)## Warning: Paket 'tidyverse' wurde unter R Version 4.1.3 erstellt
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 1.0.1
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.1 v stringr 1.5.0
## v readr 2.1.3 v forcats 0.5.2
## Warning: Paket 'ggplot2' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'tibble' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'tidyr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'readr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'purrr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'dplyr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'stringr' wurde unter R Version 4.1.3 erstellt
## Warning: Paket 'forcats' wurde unter R Version 4.1.3 erstellt
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
here()## [1] "C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining"
# Load the submissions data set that every analysis below reads from.
eda <- readRDS("three_submission.rds")

# Number of submissions per organisation type for three consultation rounds
# Rows whose size is "Missing" are excluded; each box summarises the counts of
# the (type, consult_round, size) combinations for one organisation type.
eda %>%
  filter(size != "Missing") %>%
  count(type, consult_round, size, sort = TRUE) %>%
  mutate(size = fct_reorder(size, n)) %>%
  ggplot(aes(x = type, y = n, fill = consult_round)) +
  geom_boxplot() +
  coord_flip()

# Attached here because the next plot uses geom_violindot(), theme_modern() and
# scale_fill_material_d().
library(see)
# Distribution of submission counts over time, filled by consultation round.
# Rows whose size is "Missing" are excluded before counting.
eda %>%
  filter(size != "Missing") %>%
  count(type, time, country, consult_round, sort = TRUE) %>%
  mutate(country = fct_reorder(country, n)) %>%
  ggplot(aes(x = time, y = n, fill = consult_round)) +
  geom_violindot(fill_dots = "black") +
  theme_modern() +
  scale_fill_material_d() +
  coord_flip()

# During the three submission rounds which countries and size of organisations
# submitted most often
# Submission counts per country, coloured by organisation size, one facet per
# consultation round. `count` is the number of rows in each
# (size, country, consult_round) group after dropping size == "Missing".
eda %>%
  group_by(size, country, consult_round) %>%
  filter(size != "Missing") %>%
  mutate(count = n()) %>%
  ggplot(aes(x = country, y = count, colour = size, group = country)) +
  geom_count() +
  coord_flip() +
  # geom_line() +
  facet_wrap(~consult_round) +
  theme_bw() +
  # The x aesthetic is `country`; size is shown through the colour legend.
  labs(y = "submission count", x = "Country") +
  theme(legend.position = "right")
# ggsave("featuregraph2.png", b, width = 9, height = 6, units = "in")

# Submission counts per organisation type, coloured by size, one facet per
# country. Rows with a "Missing" country or size are excluded.
eda %>%
  group_by(type) %>%
  filter(country != "Missing", size != "Missing") %>%
  mutate(count = n()) %>%
  ggplot(aes(x = type, y = count, colour = size, group = size)) +
  coord_flip() +
  geom_count() +
  # geom_line() +
  facet_wrap(~country) +
  theme_bw() +
  labs(y = "submission count", x = "Type of Organisation") # +
# theme(legend.position = "none")
# ggsave("featuregraph.png", c, width = 9, height = 6, units = "in")

# Submissions for the five most frequent countries; the rest are lumped
# together by fct_lump().
eda %>%
  count(origin = fct_lump(country, n = 5))

library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
# Submissions per country (nine most frequent, others lumped and then removed),
# one facet per organisation type.
eda %>%
  count(country = fct_lump(country, 9), type) %>%
  filter(country != "Other", country != "Missing") %>%
  # mutate(type = reorder_within(type, n, country)) %>% # item we want to reorder, what to reorder by, the groups we want to reorder within
  mutate(type = fct_reorder(type, n)) %>%
  ggplot(aes(x = n, y = country)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(vars(type)) +
  labs(
    y = "Countries",
    x = "Number of submissions",
    title = "Which organisations from which countries submitted most often?"
  )

# Same counts with the axes swapped: organisation types on the y axis, one
# facet per country. The "Other" organisation type is removed as well.
eda %>%
  count(country = fct_lump(country, 9), type) %>%
  filter(country != "Other", country != "Missing", type != "Other") %>%
  # mutate(type = reorder_within(type, n, country)) %>% # item we want to reorder, what to reorder by, the groups we want to reorder within
  mutate(type = fct_reorder(type, n)) %>%
  ggplot(aes(x = n, y = type)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(vars(country)) +
  labs(
    y = "Interest Groups",
    x = "Number of submissions",
    title = "Which organisations from which countries submitted most often?"
  )

# Submissions for the six most frequent countries (others lumped), ordered by
# count.
eda %>%
  filter(!is.na(country)) %>%
  count(initiatives = fct_lump(country, 6)) %>%
  mutate(initiatives = fct_reorder(initiatives, n)) %>%
  ggplot(aes(x = n, y = initiatives)) +
  geom_col()
# library(codebook)
# codebook_data <- detect_missing(submission,
# only_labelled = TRUE, # only labelled values are autodetected as
# missing
# negative_values_are_missing = FALSE, # negative values are NOT missing values
# ninety_nine_problems = TRUE, # 99/999 are missing values, if they
# are more than 5 MAD from the median
# )
# codebook(codebook_data)
# library(dataMaid)
# makeCodebook(submission, reportTitle = "ec crawling")

# Chi-square test of independence between organisation type and size.
# factor() re-derives the levels from the data, so a level without any
# observation no longer contributes a zero expected count. The run recorded
# below was made without factor() and returned Chisq = NaN, presumably for that
# reason (NOTE(review): confirm against the data).
summary(table(factor(eda$type), factor(eda$size))) # chi-square test
## Number of cases in table: 1065
## Number of factors: 2
## Test for independence of all factors:
## Chisq = NaN, df = 44, p-value = NA
## Chi-squared approximation may be incorrect
# One row per word of each submission text, with stop words removed. The
# listed metadata columns are carried along for every token.
token <- eda %>%
  select(
    id, text, time, type, org, size,
    country, consult_round
  ) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Words occurring more than 200 times within one (type, country) combination.
# count() is given the three columns directly instead of a preceding
# group_by(): the result is then ungrouped, so `word` can be re-levelled by
# mutate() (a grouping column cannot be modified).
token %>%
  count(word, type, country, sort = TRUE) %>%
  filter(n > 200) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, group = type)) +
  geom_col() +
  labs(y = NULL)
# token %>%
# count(word, sort = TRUE) %>%
# filter(word!= "de", word != "nand", word != "nthe", word!= "article", word != "human") %>%
# mutate(word = reorder(word, n)) %>%
# ggplot(aes(n, word)) +
# geom_col() +
# labs(y = NULL)

library(textdata)
## Warning: package 'textdata' was built under R version 4.1.3

# Words the NRC lexicon tags with the sentiment "joy".
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Most frequent joy words in the submissions. The join key is spelled out; it
# is the same key the recorded run picked automatically ("word").
token %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
library(tidyr)
# token$id <- readr::parse_number(token$id)

# Net Bing sentiment (positive minus negative word count) for every
# (type, consult_round, country) combination; type == "Missing" is dropped.
# values_fill = 0 gives combinations lacking one polarity a count of zero.
sentiment <- token %>%
  # Explicit key: the same column the recorded run joined on automatically.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(type, sentiment, consult_round, country) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  filter(type != "Missing")
# index = id %/% 80,

# Bing Sentiment-Scores for each Country
library(ggplot2)

# Distribution of net Bing sentiment per consultation round, one facet per
# country.
ggplot(sentiment, aes(consult_round, sentiment, fill = consult_round)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(vars(country)) +
  labs(
    # The x aesthetic is `consult_round`, so the label names that variable.
    x = "Consultation round",
    y = "Bing Sentiment Scores"
  )
# ggsave("featuregraph.png", c, width = 9, height = 6, units = "in")

# How often each Bing-lexicon word occurs in the submissions, with its
# polarity, most frequent first.
bing_word_counts <- token %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
# The ten most frequent words of each polarity (ties are kept by slice_max()),
# one facet per sentiment with its own y scale.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
# ggsave("featuregraph.png", c, width = 9, height = 6, units = "in")

# Needed by the wordcloud() and comparison.cloud() calls further down.
library(wordcloud)
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.1.3
# Word cloud of the 100 most frequent tokens, after removing a hand-picked list
# of unwanted tokens.
token %>%
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  filter(
    # `!=` drops NA words implicitly; %in% does not, so NA is excluded
    # explicitly to keep the result unchanged.
    !is.na(word),
    !word %in% c(
      "de", "nand", "nthe", "article", "human", "nai",
      "la", "des", "nof", "na", "nto", "en",
      "xa0", "und", "nthis", "annex4", "ánational",
      "annex", "3", "1", "2021", "e.g.", "nfor"
    )
  ) %>%
  with(wordcloud(word, n, max.words = 100))
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths

# Comparison cloud of negative versus positive Bing words. acast() builds a
# word-by-sentiment count matrix, with 0 where a word lacks one polarity.
token %>%
  # Explicit key: the same column the recorded run joined on automatically.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 95
  )
# Negative half of the Bing lexicon.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total number of tokens per organisation type, country and consultation
# round. `.groups = "drop"` returns an ungrouped table, so no residual grouping
# leaks into later steps and summarise() no longer prints its regrouping note.
wordcounts <- token %>%
  group_by(type, country, consult_round) %>%
  summarize(words = n(), .groups = "drop")
# Share of Bing-negative words among all tokens, per organisation type, country
# and consultation round.
#
# Negative words are counted at the same granularity as `wordcounts`
# (type, country, consult_round) and joined on all three keys. Joining on only
# (type, country) would pair one negative count spanning all rounds with every
# per-round total, duplicating rows and inflating the ratio.
negative_ratio <- token %>%
  semi_join(bingnegative, by = "word") %>%
  count(type, country, consult_round, name = "negativewords") %>%
  left_join(wordcounts, by = c("type", "country", "consult_round")) %>%
  mutate(ratio = negativewords / words) %>%
  # Drop submissions whose organisation type is unknown in the table and both
  # plots.
  filter(type != "Missing")

# For every organisation type, the (country, consult_round) row with the
# highest negative-word ratio; ties are kept by slice_max().
most_negative <- negative_ratio %>%
  group_by(type) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()

most_negative

# Highest negative-word ratio per organisation type.
most_negative %>%
  ggplot(aes(ratio, type)) +
  geom_col(show.legend = TRUE)

# The same rows by country, coloured by consultation round.
most_negative %>%
  ggplot(aes(ratio, country, group = type, color = consult_round)) +
  geom_point(show.legend = TRUE)
# One row per pair of consecutive words (bigram) in each submission text; rows
# whose bigram is NA are removed.
ec_bigrams <- eda %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# Most frequent bigrams, stop words included.
ec_bigrams %>%
  count(bigram, sort = TRUE)

library(tidyr)
# Split each bigram at the space into its first and second word.
bigrams_separated <- separate(
  ec_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)

# Keep a bigram only when neither of its words is a stop word.
bigrams_filtered <- filter(
  bigrams_separated,
  !(word1 %in% stop_words$word | word2 %in% stop_words$word)
)

# new bigram counts:
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# Print the stop-word-free bigram counts.
bigram_counts

# Paste the two filtered words back into a single space-separated bigram column.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united